In [1]:
#params
DOCS_FOLDER = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs"
FILE_MASK = ".*\.txt"

MIN_DOC_FREQ = 30
MAX_PHRASE_LEN = 4
STOP_WORDS_FILE = "/Users/simon.hughes/GitHub/analytics-python/SolrlikeAnalysisPipeline/data/dice_stop_words.txt"
PHRASES_FILE    = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt"
PHRASE_FREQ_FILE= '/Users/simon.hughes/Documents/Dice Data/LuceneTalk/phrases_freq.txt'

In [2]:
import time
full_start = time.time()

In [3]:
import os
import re
import numpy as np
import time
import hashlib
from collections import defaultdict

re_collapse_spaces = re.compile("\s+")

def collapse_spaces(s):
    return re_collapse_spaces.sub(" ", s).strip()

re1 = re.compile("[;:\'\"\*/\),\(\|\s]+")
def clean_str(s):
    s = str(s).replace("'s"," ")
    # replaced separately - awkward to include in the character class above
    s = s.replace("-", " ").replace("\\", " ")
    s = re1.sub(" ",s).strip()
    return collapse_spaces(s)

def compute_ngrams(tokens, max_len = None, min_len = 1):
    """
    tokens  :   iterable of string
                    a single sentence of tokens. Assumes start and stop tokens omitted
    max_len :   int
                    maximum ngram length
    min_len :   int
                    minimum ngram length

    """
    if max_len is None:
        max_len = len(tokens)

    if min_len > max_len:
        raise ValueError("min_len cannot be more than max_len")

    ngrams = set()
    # all n-gram lengths from min_len up to max_len
    for ngram_size in range(min_len, max_len + 1):
        for start in range(0, len(tokens) - ngram_size + 1):
            # tuples are hashable, lists are not
            ngrams.add(tuple(tokens[start:start + ngram_size]))
    return ngrams
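
# for example (illustrative tokens, not from the corpus):
#   compute_ngrams(("machine", "learning", "engineer"), max_len=2)
#   -> {("machine",), ("learning",), ("engineer",),
#      ("machine", "learning"), ("learning", "engineer")}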

# is a valid token
__bad_chars__ = "<>{}[]~@"
__punct__ = set(".?!,;:")
def is_valid_term(term):
    # reject empty terms
    if len(term) == 0:
        return False
    # keep single alphabetic chars (else we'd miss languages like c and r)
    if len(term) == 1:
        return term.isalpha()
    # no html or js terms
    for c in __bad_chars__:
        if c in term:
            return False
    if term[-1] in __punct__:
        return False
    if "function(" in term:
        return False
    if "!" in term or "?" in term:
        return False
    digit_chars = 0.0
    for c in term:
        if c.isdigit() or not c.isalnum():
            digit_chars += 1.0
    # mostly digits or other non-alphanumeric chars (75% or more)?
    if (digit_chars / len(term)) >= 0.75:
        return False
    return True
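
# for example: is_valid_term("c") and is_valid_term("c++") are True,
# while is_valid_term("12345") and is_valid_term("<div>") are False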

# note: unlike re1 above, this pattern also strips hyphens
re_punct = re.compile("[;:\'\"\*/\),\(\-\|\s]+")

# we may want to keep some non-alpha characters, such as # in C# and + in C++, etc.
def remove_punct(s):
    s = s.replace("'s"," ")
    return collapse_spaces(re_punct.sub(" ", s).strip())
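
# for example (illustrative input): remove_punct("senior c++/c# developer (remote)")
# -> "senior c++ c# developer remote" - note that '+' and '#' survive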

def hash_string(s):
    # md5 is plenty here - we only need a cheap fingerprint for de-duplicating sentences
    return hashlib.md5(s).hexdigest()

def find_files(folder, regex, remove_empty = False):
    """
    Find all files matching the [regex] pattern in [folder]

    folder  :   string
                    folder to search (not recursive)
    regex   :   string (NOT regex object)
                    pattern to match
    remove_empty : bool
                    if True, skip zero-byte files
    """
    files = os.listdir(folder)
    matches = [os.path.abspath(os.path.join(folder, f))
               for f in files
               if re.search(regex, f, re.IGNORECASE)]

    if remove_empty:
        matches = [f for f in matches if os.path.getsize(f) > 0]
    matches.sort()
    return matches
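
A note on FILE_MASK: it is an unanchored search pattern, so a name like notes.txt.bak would also match. If that matters, a stricter, end-anchored mask (a suggested tweak, not what was run above) would be:

FILE_MASK = r".*\.txt$"   # only match names that end in .txt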

Load Stop Words


In [4]:
# load stop words
def load_stop_words(stop_words_file):
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines and comment lines
            if word and word[0] != "#":
                stop_words.add(word.lower())
    return stop_words

if STOP_WORDS_FILE:
    stop_words = load_stop_words(STOP_WORDS_FILE)
    print("%i stop words loaded" % len(stop_words))
else:
    stop_words = set()


1445 stop words loaded

Get Files


In [6]:
import os, re
start = time.time()

files = find_files(DOCS_FOLDER, FILE_MASK, True)
print("%s files found in %s" % (len(files), DOCS_FOLDER))
documents = []
for i, fname in enumerate(files):
    with open(fname) as f:
        contents = f.read()
        documents.append(contents.split("\n"))
end = time.time()
print("Loading %i documents took %s seconds" % (len(files), str(end - start)))


66989 files found in /Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs
Loading 66989 documents took 14.0081241131 seconds

Extract Common Terms and Phrases


In [7]:
start = time.time()
#Or use a counter here.
doc_freq = defaultdict(int)

# tokenise each doc, dropping duplicate sentences (docs left with no sentences are skipped)
tokenized_docs = []
sent_id = 0
sent_ids = set()
lens = []
hashed = set()

for doc in documents:
    un_tokens = set()
    tok_sents = []
    for sent in doc:
        cl_sent = remove_punct(sent.lower())
        hash_sent = hash_string(cl_sent)
        # skip duplicate sentences (note - this will hurt df accuracy a little)
        if hash_sent in hashed:
            continue
        hashed.add(hash_sent)

        tokens = tuple(cl_sent.split(" "))
        lens.append(len(tokens))
        sent_id += 1
        tok_sents.append((sent_id, tokens))
        sent_ids.add(sent_id)
        
        # track the unique tokens in this document (for the doc-freq counts)
        for tok in tokens:
            if tok not in un_tokens:
                un_tokens.add(tok)
                doc_freq[tok] += 1
        
    if len(tok_sents) > 0:
        tokenized_docs.append(tok_sents)

end = time.time()
print end-start, "secs"
len(tokenized_docs), "docs", len(doc_freq), "unique tokens"


55.7478640079 secs
Out[7]:
(62696, 'docs', 137661, 'unique tokens')
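
At this point tokenized_docs holds one list per document, each a list of (sentence id, token tuple) pairs, and doc_freq maps each token to the number of documents containing it. A shape sketch with made-up values:

# tokenized_docs == [
#     [(1, ("senior", "java", "developer")),        # doc 1: (sent_id, tokens) per sentence
#      (2, ("spring", "boot", "microservices"))],
#     [(3, ("python", "data", "engineer"))],        # doc 2
#     ...
# ]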

In [8]:
# Get really frequent items for removal
num_docs = float(len(tokenized_docs))
above_threshold = [k for k,v in doc_freq.items() if v >= MIN_DOC_FREQ]

# remove really frequent terms (in 50% or more of documents)
too_freq = set([k for k in above_threshold if (doc_freq[k]/num_docs) >= 0.50])

freq_terms = [k for k in above_threshold 
              if  k not in stop_words and 
                  k not in too_freq and 
                  is_valid_term(k)]
print("%s frequent terms identified for building phrases" % len(freq_terms))


8455 frequent terms identified for building phrases

In [9]:
import time

start = time.time()

# Find all phrases up to length MAX_PHRASE_LEN at or above the defined MIN_DOC_FREQ above
phrase_doc_freq = defaultdict(int)
for term in freq_terms:
    phrase_doc_freq[tuple([term])] = doc_freq[term]
    
# working_docs mirrors tokenized_docs: a list of documents, each a list of (sent_id, phrases) pairs,
# where phrases is a list of token sequences that may still yield frequent n-grams.
# Seed each sentence with a single phrase - the whole sentence.

#working_docs = [map(lambda sent: [sent], d) for d in tokenized_docs]
working_docs = [map(lambda (sid, sent): (sid, [sent]), d) for d in tokenized_docs]
working_freq_terms = set(freq_terms)

# sentences with one or more phrases that are frequent enough (under the apriori algorithm's closure principle)
working_sent_ids = set(sent_ids)
# don't bother whittling this down further at the start - almost all sentences have at least one freq term in them

for phrase_len in range(2, MAX_PHRASE_LEN + 1):
    phrase_start = time.time()
    print "phrase_len", phrase_len
    print len(working_docs), "docs", len(working_freq_terms), "terms", len(working_sent_ids), "sentences"
    # for current phrase_len
    current_phrase_doc_freq = defaultdict(int)
    
    # used to look up sentence ids by phrase
    phrase2sentids = defaultdict(set)
    
    new_work_docs = []
    for doc in working_docs:
        new_work_sents = []
        unique_potential_phrases = set()
        for sent_id, phrases in doc:
            if sent_id not in working_sent_ids:
                continue
            
            new_work_phrases = []
            for phrase in phrases:
                current_phrase = []
                for term in phrase:
                    if term in working_freq_terms:
                        current_phrase.append(term)
                    else:
                        if len(current_phrase) >= phrase_len:
                            new_work_phrases.append(current_phrase)
                        current_phrase = []
                
                if len(current_phrase) >= phrase_len:
                    new_work_phrases.append(current_phrase)
            
            if len(new_work_phrases) > 0:
                for phrase in new_work_phrases:
                    ngrams = compute_ngrams(phrase, phrase_len, phrase_len)
                    for tpl_ngram in ngrams:                        
                        unique_potential_phrases.add(tpl_ngram)
                        phrase2sentids[tpl_ngram].add(sent_id)
                        
                new_work_sents.append((sent_id, new_work_phrases))
        
        # end for sent in doc
        # for doc freq, need to compute unique phrases in document
        for unique_tpl_phrase in unique_potential_phrases:
            current_phrase_doc_freq[unique_tpl_phrase] +=1
        
        if len(new_work_sents) > 0:
            new_work_docs.append(new_work_sents)

    new_working_freq_terms = set()
    new_working_sent_ids = set()
    for tuple_phrase, freq in current_phrase_doc_freq.items():
        if freq < MIN_DOC_FREQ:
            continue            
        phrase_doc_freq[tuple_phrase] = freq
        new_working_sent_ids.update(phrase2sentids[tuple_phrase])
        for tok in tuple_phrase:
            new_working_freq_terms.add(tok)
    
    if len(new_working_freq_terms) <= 1 or len(new_work_docs) == 0 or len(new_working_sent_ids) == 0:
        break
    working_docs = new_work_docs
    working_freq_terms = new_working_freq_terms
    working_sent_ids = new_working_sent_ids
    phrase_end = time.time()
    print "\t", len(phrase_doc_freq), "phrases found"
    print "\ttook", phrase_end - phrase_start, "seconds"
    
print ""
end = time.time()
print "took", end - start, "seconds"
print len(phrase_doc_freq), "phrases found"


phrase_len 2
62696 docs 8455 terms 1213706 sentences
	22957 phrases found
	took 33.1819739342 seconds
phrase_len 3
61803 docs 3130 terms 772061 sentences
	25250 phrases found
	took 14.6069819927 seconds
phrase_len 4
59772 docs 1066 terms 132253 sentences
	25579 phrases found
	took 3.33791399002 seconds

took 56.7129700184 seconds
25579 phrases found
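
For intuition, this is how a single sentence is treated on the bigram pass: it is first split into maximal runs of still-frequent terms, and only n-grams that fit inside one run are counted. A miniature illustration with made-up terms, reusing compute_ngrams from above:

demo_freq_terms = {"java", "spring", "boot", "developer"}
demo_sent = ("senior", "java", "spring", "boot", "and", "developer")
runs, current = [], []
for term in demo_sent:
    if term in demo_freq_terms:
        current.append(term)
    else:
        if len(current) >= 2:
            runs.append(current)
        current = []
if len(current) >= 2:
    runs.append(current)
# runs == [["java", "spring", "boot"]] - "developer" is dropped because its run is too short
demo_bigrams = set()
for run in runs:
    demo_bigrams.update(compute_ngrams(run, 2, 2))
# demo_bigrams == {("java", "spring"), ("spring", "boot")}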

Find and Remove Sub-Phrases with Near-Equivalent Frequencies


In [10]:
# many short phrases have (almost) the same doc freq as a longer phrase that contains them - in those cases keep only the longer phrase

def find_sub_phrases_to_remove(tpl_phrase, valid_phrases, doc_freq, to_rem):
    if len(tpl_phrase) <= 1:
        return
    phrase_df = doc_freq[tpl_phrase]
    ngrams = compute_ngrams(tpl_phrase, len(tpl_phrase)-1, 1)
    for tpl_ngram in ngrams:
        if tpl_ngram in valid_phrases and tpl_ngram not in to_rem:
            sub_phr_df = doc_freq[tpl_ngram]
            # a sub-phrase is at least as frequent as the full phrase; if its doc freq is
            # within ~10% of the longer phrase's, it adds little information, so remove it
            if phrase_df >= (0.9 * sub_phr_df):
                to_rem.add(tpl_ngram)
                #to_rem_dbg.add((tpl_phrase, tpl_ngram, phrase_df, sub_phr_df))
                find_sub_phrases_to_remove(tpl_ngram, valid_phrases, doc_freq, to_rem)

# don't process unigrams
valid_phrases = set(phrase_doc_freq.keys())
phrases = filter(lambda k: len(k) > 1, phrase_doc_freq.keys())
to_remove = set()

for tpl_key in sorted(phrases, key = lambda k: -len(k)):
    if tpl_key not in to_remove:
        # find all shorter sub-phrases with (nearly) the same doc freq and mark them for removal
        find_sub_phrases_to_remove(tpl_key, valid_phrases, phrase_doc_freq, to_remove)

len(to_remove)#, len(to_remove_dbg)


Out[10]:
793
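
To make the pruning rule concrete, here is a toy call with made-up counts: if ("java", "developer") has a doc freq of 1000 and ("java",) on its own 1050, the unigram is within 10% of the bigram's frequency and gets marked for removal, while a far more common sub-phrase is kept:

demo_df = {("java", "developer"): 1000, ("java",): 1050, ("developer",): 5000}
demo_remove = set()
find_sub_phrases_to_remove(("java", "developer"), set(demo_df.keys()), demo_df, demo_remove)
# demo_remove == {("java",)} - ("developer",) survives because 1000 < 0.9 * 5000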

In [11]:
#Dump phrases to file
with open(PHRASES_FILE, "w+") as f:
    for tpl in sorted(phrase_doc_freq.keys()):
        # skip sub-phrases flagged for removal above
        if tpl not in to_remove:
            joined = " ".join(tpl)
            f.write(joined + "\n")
    
full_end = time.time()
print("Whole process took %s seconds" % str(full_end - full_start))


Whole process took 127.771396875 seconds

In [12]:
with open(PHRASE_FREQ_FILE, "w+") as f:
    for tpl,v in sorted(phrase_doc_freq.items(), key = lambda (k,v): -v):
        # skip sub-phrases flagged for removal above
        if tpl not in to_remove:
            joined = " ".join(tpl)
            f.write(joined + "," + str(v) + "\n")